In [1]:
## 設定
verbose = False
### 言語の割合の均等化
balanced = True
### LDA 用
## トピック数
n_topics = 20 # 30は多過ぎる?
## doc, term の設定
doc_type = 'form'
doc_attr = 'spell'
max_doc_size = 12
##
term_size = 'character'
term_type = 'skippy2gram'
## skippy n-gram の結合範囲
max_distance_val = round(max_doc_size * 0.8)
print(f"max_distance_val: {max_distance_val}")
## ngram を包括的にするかどうか
ngram_is_inclusive = True
### DTM 構築
## term の最低頻度
term_min_freq = 2
## 高頻度 term の濫用指標: 大きくし過ぎないように.0.05 は十分に大きい
term_abuse_threshold = 0.04
max_distance_val: 10
In [2]:
import sys, os, random, re, glob
import pandas as pd
import pprint as pp
from functools import reduce
In [3]:
## load data to process
from pathlib import Path
import pprint as pp
wd = Path(".")
dirs = [ x for x in wd.iterdir() if x.is_dir() and not x.match(r"plot*") ]
if verbose:
print(f"The following {len(dirs)} directories are potential targets:")
pp.pprint(dirs)
## list up files in target directory
wd = Path(".")
target_dir = "data-words" # can be changed
target_files = sorted(list(wd.glob(f"{target_dir}/*.csv")))
#
print(f"\n{target_dir} contains {len(target_files)} files to process")
pp.pprint(target_files)
data-words contains 20 files to process
[PosixPath('data-words/base-sound-English-r6e-originals.csv'),
PosixPath('data-words/base-sound-French-r0-opendic-s900.csv'),
PosixPath('data-words/base-sound-German-r1a-original.csv'),
PosixPath('data-words/base-spell-Arabic-r0-1k-mc.csv'),
PosixPath('data-words/base-spell-Chinese-r0-1k-mc.csv'),
PosixPath('data-words/base-spell-English-r6e-originals.csv'),
PosixPath('data-words/base-spell-Esperanto-r0-orginal.csv'),
PosixPath('data-words/base-spell-Finnish-r0-1k-mc.csv'),
PosixPath('data-words/base-spell-French-r0-1k-mc.csv'),
PosixPath('data-words/base-spell-German-r1a-originals.csv'),
PosixPath('data-words/base-spell-Greek-r0-1k-mc.csv'),
PosixPath('data-words/base-spell-Hungarian-r0-1k-mc.csv'),
PosixPath('data-words/base-spell-Icelandic-r0-original.csv'),
PosixPath('data-words/base-spell-Irish-r0-1k-mc.csv'),
PosixPath('data-words/base-spell-Italian-r0-1k-mc.csv'),
PosixPath('data-words/base-spell-Japanese-r0-1k-mc.csv'),
PosixPath('data-words/base-spell-Russian-r0-1k-mc.csv'),
PosixPath('data-words/base-spell-Spanish-r0-1k-mc.csv'),
PosixPath('data-words/base-spell-Swahili-r0-1k-mc.csv'),
PosixPath('data-words/base-spell-Turkish-r0-1k-mc.csv')]
In [4]:
import pandas as pd
## データ型の辞書
types = re.split(r",\s+", "spell, sound, freq")
type_settings = { t : 0 for t in types }
print(type_settings)
## 言語名の辞書
lang_name_text = "arabic, bengali, chinese, english, esperanto, finnish, french, \
greek, galician, german, hungarian, icelandic, irish, italian, japanese, russian, spanish, swahili, turkish"
langs = re.split(r",\s*", lang_name_text)
#langs = "english esperanto french german russian swahili".split(" ")
#langs = "english esperanto french german icelandic swahili".split(" ")
lang_settings = { lang : 0 for lang in langs }
print(f"{len(lang_settings.keys())} langs are available")
print(lang_settings)
## 辞書と統合
settings = { 'form': None, **type_settings, **lang_settings }
print(settings)
{'spell': 0, 'sound': 0, 'freq': 0}
19 langs are available
{'arabic': 0, 'bengali': 0, 'chinese': 0, 'english': 0, 'esperanto': 0, 'finnish': 0, 'french': 0, 'greek': 0, 'galician': 0, 'german': 0, 'hungarian': 0, 'icelandic': 0, 'irish': 0, 'italian': 0, 'japanese': 0, 'russian': 0, 'spanish': 0, 'swahili': 0, 'turkish': 0}
{'form': None, 'spell': 0, 'sound': 0, 'freq': 0, 'arabic': 0, 'bengali': 0, 'chinese': 0, 'english': 0, 'esperanto': 0, 'finnish': 0, 'french': 0, 'greek': 0, 'galician': 0, 'german': 0, 'hungarian': 0, 'icelandic': 0, 'irish': 0, 'italian': 0, 'japanese': 0, 'russian': 0, 'spanish': 0, 'swahili': 0, 'turkish': 0}
In [5]:
vars = list(settings.keys())
print(f"targe var names: {vars}")
d_parts = [ ]
for lang in langs:
local_settings = settings.copy()
if check:
print(f"processing: {lang}")
try:
for f in [ f for f in target_files if lang.capitalize() in str(f) ]:
print(f"reading: {f}")
# 言語名の指定
local_settings[lang] = 1
# 型名の指定
for type in vars:
if type in str(f):
local_settings[type] = 1
#
try:
d = pd.read_csv(f, encoding='utf-8', sep = ",", on_bad_lines = 'skip') # Crucially, ...= skip
df = pd.DataFrame(d, columns = vars)
for var in [ var for var in (types + langs) if var != 'freq' ]:
df[var] = local_settings[var]
d_parts.append(df)
except FileNotFoundError:
pass
except IndexError:
pass
#
if verbose:
d_parts
targe var names: ['form', 'spell', 'sound', 'freq', 'arabic', 'bengali', 'chinese', 'english', 'esperanto', 'finnish', 'french', 'greek', 'galician', 'german', 'hungarian', 'icelandic', 'irish', 'italian', 'japanese', 'russian', 'spanish', 'swahili', 'turkish'] processing: arabic reading: data-words/base-spell-Arabic-r0-1k-mc.csv processing: bengali processing: chinese reading: data-words/base-spell-Chinese-r0-1k-mc.csv processing: english reading: data-words/base-sound-English-r6e-originals.csv reading: data-words/base-spell-English-r6e-originals.csv processing: esperanto reading: data-words/base-spell-Esperanto-r0-orginal.csv processing: finnish reading: data-words/base-spell-Finnish-r0-1k-mc.csv processing: french reading: data-words/base-sound-French-r0-opendic-s900.csv reading: data-words/base-spell-French-r0-1k-mc.csv processing: greek reading: data-words/base-spell-Greek-r0-1k-mc.csv processing: galician processing: german reading: data-words/base-sound-German-r1a-original.csv reading: data-words/base-spell-German-r1a-originals.csv processing: hungarian reading: data-words/base-spell-Hungarian-r0-1k-mc.csv processing: icelandic reading: data-words/base-spell-Icelandic-r0-original.csv processing: irish reading: data-words/base-spell-Irish-r0-1k-mc.csv processing: italian reading: data-words/base-spell-Italian-r0-1k-mc.csv processing: japanese reading: data-words/base-spell-Japanese-r0-1k-mc.csv processing: russian reading: data-words/base-spell-Russian-r0-1k-mc.csv processing: spanish reading: data-words/base-spell-Spanish-r0-1k-mc.csv processing: swahili reading: data-words/base-spell-Swahili-r0-1k-mc.csv processing: turkish reading: data-words/base-spell-Turkish-r0-1k-mc.csv
In [6]:
## データ統合
raw_df = pd.concat(d_parts)
raw_df
Out[6]:
| form | spell | sound | freq | arabic | bengali | chinese | english | esperanto | finnish | ... | german | hungarian | icelandic | irish | italian | japanese | russian | spanish | swahili | turkish | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | كما | 1 | 0 | 1.0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | أنا | 1 | 0 | 1.0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | له | 1 | 0 | 1.0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | أن | 1 | 0 | 1.0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | هو | 1 | 0 | 1.0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 994 | burun | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 995 | çoğul | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 996 | öfke | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 997 | iddia | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 998 | kıta | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
25217 rows × 23 columns
In [7]:
## 文字数の列を追加
raw_df['size'] = [ len(x) for x in raw_df[doc_type] ]
raw_df
Out[7]:
| form | spell | sound | freq | arabic | bengali | chinese | english | esperanto | finnish | ... | hungarian | icelandic | irish | italian | japanese | russian | spanish | swahili | turkish | size | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | كما | 1 | 0 | 1.0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
| 1 | أنا | 1 | 0 | 1.0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
| 2 | له | 1 | 0 | 1.0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 |
| 3 | أن | 1 | 0 | 1.0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 |
| 4 | هو | 1 | 0 | 1.0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 994 | burun | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 5 |
| 995 | çoğul | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 5 |
| 996 | öfke | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 4 |
| 997 | iddia | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 5 |
| 998 | kıta | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 4 |
25217 rows × 24 columns
In [8]:
## 言語名= language の列を追加
check = False
language_vals = [ ]
for i, row in raw_df.iterrows():
if check:
print(row)
for j, lang in enumerate(langs):
if check:
print(f"{i}: {lang}")
if row[lang] == 1:
language_vals.append(lang)
if verbose:
print(language_vals)
len(language_vals)
#
raw_df['language'] = language_vals
raw_df
Out[8]:
| form | spell | sound | freq | arabic | bengali | chinese | english | esperanto | finnish | ... | icelandic | irish | italian | japanese | russian | spanish | swahili | turkish | size | language | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | كما | 1 | 0 | 1.0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | arabic |
| 1 | أنا | 1 | 0 | 1.0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | arabic |
| 2 | له | 1 | 0 | 1.0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | arabic |
| 3 | أن | 1 | 0 | 1.0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | arabic |
| 4 | هو | 1 | 0 | 1.0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | arabic |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 994 | burun | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 5 | turkish |
| 995 | çoğul | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 5 | turkish |
| 996 | öfke | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 4 | turkish |
| 997 | iddia | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 5 | turkish |
| 998 | kıta | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 4 | turkish |
25217 rows × 25 columns
In [9]:
## 言語の選別
select_languages = True
#selected_langs = re.split(r",\s*", "english, french, german, russian, swahili")
selected_langs = re.split(r",\s*",
"arabic, bengali, chinese, english, french, german, \
greek, hungarian, russian, japanese, turkish")
print(f"selected languages: {selected_langs}")
if select_languages:
df_new = [ ]
for lang in selected_langs:
df_new.append(raw_df[raw_df[lang] == 1])
raw_df = pd.concat(df_new)
#
raw_df
selected languages: ['arabic', 'bengali', 'chinese', 'english', 'french', 'german', 'greek', 'hungarian', 'russian', 'japanese', 'turkish']
Out[9]:
| form | spell | sound | freq | arabic | bengali | chinese | english | esperanto | finnish | ... | icelandic | irish | italian | japanese | russian | spanish | swahili | turkish | size | language | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | كما | 1 | 0 | 1.0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | arabic |
| 1 | أنا | 1 | 0 | 1.0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | arabic |
| 2 | له | 1 | 0 | 1.0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | arabic |
| 3 | أن | 1 | 0 | 1.0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | arabic |
| 4 | هو | 1 | 0 | 1.0 | 1 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | arabic |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 994 | burun | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 5 | turkish |
| 995 | çoğul | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 5 | turkish |
| 996 | öfke | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 4 | turkish |
| 997 | iddia | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 5 | turkish |
| 998 | kıta | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 4 | turkish |
18963 rows × 25 columns
In [10]:
## 文字数の分布
import numpy as np
import matplotlib.pyplot as plt
fig = plt.figure()
ax = fig.add_subplot(1,1,1)
ax.hist(raw_df['size'], bins = 40)
ax.set_xlabel('length of doc')
ax.set_ylabel('freq')
plt.title(f"Length distribution for docs")
fig.show()
/var/folders/s2/lk8hdt6j10j0xyycw1lbjsm40000gn/T/ipykernel_69561/1088473461.py:12: UserWarning: FigureCanvasAgg is non-interactive, and thus cannot be shown fig.show()
In [11]:
## 長さで濾過
print(f"max doc size: {max_doc_size}")
original_size = len(raw_df)
raw_df = raw_df[raw_df['size'] < max_doc_size]
filtered_size = len(raw_df)
print(f"{original_size - filtered_size} cases removed")
max doc size: 12 414 cases removed
In [12]:
## 結果の検査 1
for lang in langs:
print(raw_df[lang].value_counts())
arabic 0 17554 1 995 Name: count, dtype: int64 bengali 0 18549 Name: count, dtype: int64 chinese 0 17549 1 1000 Name: count, dtype: int64 english 0 10300 1 8249 Name: count, dtype: int64 esperanto 0 18549 Name: count, dtype: int64 finnish 0 18549 Name: count, dtype: int64 french 0 16733 1 1816 Name: count, dtype: int64 greek 0 17575 1 974 Name: count, dtype: int64 galician 0 18549 Name: count, dtype: int64 german 0 16984 1 1565 Name: count, dtype: int64 hungarian 0 17558 1 991 Name: count, dtype: int64 icelandic 0 18549 Name: count, dtype: int64 irish 0 18549 Name: count, dtype: int64 italian 0 18549 Name: count, dtype: int64 japanese 0 17549 1 1000 Name: count, dtype: int64 russian 0 17576 1 973 Name: count, dtype: int64 spanish 0 18549 Name: count, dtype: int64 swahili 0 18549 Name: count, dtype: int64 turkish 0 17563 1 986 Name: count, dtype: int64
In [13]:
## 結果の検査 2
for type in types:
print(raw_df[type].value_counts())
spell 1 12829 0 5720 Name: count, dtype: int64 sound 1 11630 0 6919 Name: count, dtype: int64 freq 1.0 17569 1 966 1 не 1 1 то время как 1 1 северу 1 1 него 1 1 будет 1 1 образом 1 1 мышь 1 Name: count, dtype: int64
In [14]:
## 統合: 割合補正を適用
eng_reduct_factor = 0.2
if balanced:
eng_df = raw_df[raw_df['english'] == 1]
non_eng_df = raw_df[raw_df['english'] == 0]
eng_reduced_df = eng_df.sample(round(len(eng_df) * eng_reduct_factor))
raw_df = pd.concat([eng_reduced_df, non_eng_df])
raw_df
Out[14]:
| form | spell | sound | freq | arabic | bengali | chinese | english | esperanto | finnish | ... | icelandic | irish | italian | japanese | russian | spanish | swahili | turkish | size | language | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 3995 | ɹʌbɪʃ | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | english |
| 3467 | skill | 1 | 1 | 1.0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | english |
| 3236 | row | 1 | 1 | 1.0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | english |
| 1398 | kɔɹnkɹɪb | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | english |
| 641 | dʒɛntlmən | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 9 | english |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 994 | burun | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 5 | turkish |
| 995 | çoğul | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 5 | turkish |
| 996 | öfke | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 4 | turkish |
| 997 | iddia | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 5 | turkish |
| 998 | kıta | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 4 | turkish |
11950 rows × 25 columns
In [15]:
## データの指定
raw_df = raw_df[raw_df[doc_attr] == 1]
print(f"doc_attr: {doc_attr}")
raw_df
doc_attr: spell
Out[15]:
| form | spell | sound | freq | arabic | bengali | chinese | english | esperanto | finnish | ... | icelandic | irish | italian | japanese | russian | spanish | swahili | turkish | size | language | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 3467 | skill | 1 | 1 | 1.0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | english |
| 3236 | row | 1 | 1 | 1.0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | english |
| 2255 | 1 | 1 | 1.0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | english | |
| 3637 | straighten | 1 | 1 | 1.0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 10 | english |
| 953 | deformation | 1 | 1 | 1.0 | 0 | 0 | 0 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 11 | english |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 994 | burun | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 5 | turkish |
| 995 | çoğul | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 5 | turkish |
| 996 | öfke | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 4 | turkish |
| 997 | iddia | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 5 | turkish |
| 998 | kıta | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 4 | turkish |
9513 rows × 25 columns
In [16]:
## 結果の検査 3
for lang in langs:
print(raw_df[lang].value_counts())
arabic 0 8518 1 995 Name: count, dtype: int64 bengali 0 9513 Name: count, dtype: int64 chinese 0 8513 1 1000 Name: count, dtype: int64 english 0 8681 1 832 Name: count, dtype: int64 esperanto 0 9513 Name: count, dtype: int64 finnish 0 9513 Name: count, dtype: int64 french 0 8528 1 985 Name: count, dtype: int64 greek 0 8539 1 974 Name: count, dtype: int64 galician 0 9513 Name: count, dtype: int64 german 0 8736 1 777 Name: count, dtype: int64 hungarian 0 8522 1 991 Name: count, dtype: int64 icelandic 0 9513 Name: count, dtype: int64 irish 0 9513 Name: count, dtype: int64 italian 0 9513 Name: count, dtype: int64 japanese 0 8513 1 1000 Name: count, dtype: int64 russian 0 8540 1 973 Name: count, dtype: int64 spanish 0 9513 Name: count, dtype: int64 swahili 0 9513 Name: count, dtype: int64 turkish 0 8527 1 986 Name: count, dtype: int64
解析¶
In [17]:
## 順序のランダマイズし,基本データを決定
import sklearn.utils
df = sklearn.utils.shuffle(raw_df)
In [18]:
## ngram の追加
import sys
sys.path.append('..')
import re
import ngrams
import importlib
importlib.reload(ngrams)
import ngrams_skippy
bases = df[doc_type]
## 1gram 列の追加
#sep = r""
#unigrams = [ list(filter(lambda x: len(x) > 0, y)) for y in [ re.split(sep, z) for z in bases ] ]
unigrams = ngrams.gen_unigrams(bases, sep = r"", check = False)
if verbose:
random.sample(unigrams, 5)
#
df['1gram'] = unigrams
#df.loc[:,'1gram'] = unigrams
df
Out[18]:
| form | spell | sound | freq | arabic | bengali | chinese | english | esperanto | finnish | ... | irish | italian | japanese | russian | spanish | swahili | turkish | size | language | 1gram | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 361 | глубоко | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 7 | russian | [г, л, у, б, о, к, о] |
| 333 | 完全な | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 3 | japanese | [完, 全, な] |
| 164 | そう | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 2 | japanese | [そ, う] |
| 61 | maison | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | french | [m, a, i, s, o, n] |
| 30 | alle | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | german | [a, l, l, e] |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 227 | arbre | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | french | [a, r, b, r, e] |
| 595 | virág | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | hungarian | [v, i, r, á, g] |
| 829 | horloge | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | french | [h, o, r, l, o, g, e] |
| 998 | 爱 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | chinese | [爱] |
| 386 | sıcak | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 5 | turkish | [s, ı, c, a, k] |
9513 rows × 26 columns
In [19]:
## 2gram列の追加
bigrams = ngrams.gen_bigrams(bases, sep = r"", check = False)
## 包括的 2gram の作成
if ngram_is_inclusive:
bigrams = [ [*b, *u] for b, u in zip(bigrams, unigrams) ]
if verbose:
print(random.sample(bigrams, 3))
In [20]:
df['2gram'] = bigrams
if verbose:
df
In [21]:
## 3gram列の追加
trigrams = ngrams.gen_trigrams(bases, sep = r"", check = False)
## 包括的 3gram の作成
if ngram_is_inclusive:
trigrams = [ [ *t, *b ] for t, b in zip(trigrams, bigrams) ]
if verbose:
print(random.sample(trigrams, 3))
In [22]:
df['3gram'] = trigrams
if verbose:
df
In [23]:
## skippy 2grams の生成
import sys
sys.path.append("..") # library path に一つ上の階層を追加
import ngrams_skippy
skippy_2grams = [ ngrams_skippy.generate_skippy_bigrams(x,
missing_mark = '…',
max_distance = max_distance_val, check = False)
for x in df['1gram'] ]
## 包括的 skippy 2-grams の生成
if ngram_is_inclusive:
for i, b2 in enumerate(skippy_2grams):
b2.extend(unigrams[i])
#
if verbose:
random.sample(skippy_2grams, 3)
In [24]:
## skippy 2gram 列の追加
df['skippy2gram'] = skippy_2grams
df
Out[24]:
| form | spell | sound | freq | arabic | bengali | chinese | english | esperanto | finnish | ... | russian | spanish | swahili | turkish | size | language | 1gram | 2gram | 3gram | skippy2gram | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 361 | глубоко | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 1 | 0 | 0 | 0 | 7 | russian | [г, л, у, б, о, к, о] | [гл, лу, уб, бо, ок, ко, г, л, у, б, о, к, о] | [глу, луб, убо, бок, око, гл, лу, уб, бо, ок, ... | [гл, г…у, г…б, г…о, г…к, лу, л…б, л…о, л…к, уб... |
| 333 | 完全な | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 3 | japanese | [完, 全, な] | [完全, 全な, 完, 全, な] | [完全な, 完全, 全な, 完, 全, な] | [完全, 完…な, 全な, 完, 全, な] |
| 164 | そう | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 2 | japanese | [そ, う] | [そう, そ, う] | [そ, う, そう, そ, う] | [そう, そ, う] |
| 61 | maison | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 6 | french | [m, a, i, s, o, n] | [ma, ai, is, so, on, m, a, i, s, o, n] | [mai, ais, iso, son, ma, ai, is, so, on, m, a,... | [ma, m…i, m…s, m…o, m…n, ai, a…s, a…o, a…n, is... |
| 30 | alle | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 4 | german | [a, l, l, e] | [al, ll, le, a, l, l, e] | [all, lle, al, ll, le, a, l, l, e] | [al, a…l, a…e, ll, l…e, le, a, l, l, e] |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 227 | arbre | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 5 | french | [a, r, b, r, e] | [ar, rb, br, re, a, r, b, r, e] | [arb, rbr, bre, ar, rb, br, re, a, r, b, r, e] | [ar, a…b, a…r, a…e, rb, r…r, r…e, br, b…e, re,... |
| 595 | virág | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 5 | hungarian | [v, i, r, á, g] | [vi, ir, rá, ág, v, i, r, á, g] | [vir, irá, rág, vi, ir, rá, ág, v, i, r, á, g] | [vi, v…r, v…á, v…g, ir, i…á, i…g, rá, r…g, ág,... |
| 829 | horloge | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 7 | french | [h, o, r, l, o, g, e] | [ho, or, rl, lo, og, ge, h, o, r, l, o, g, e] | [hor, orl, rlo, log, oge, ho, or, rl, lo, og, ... | [ho, h…r, h…l, h…o, h…g, h…e, or, o…l, o…o, o…... |
| 998 | 爱 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | chinese | [爱] | [爱, 爱] | [爱, 爱, 爱] | [爱, 爱] |
| 386 | sıcak | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 1 | 5 | turkish | [s, ı, c, a, k] | [sı, ıc, ca, ak, s, ı, c, a, k] | [sıc, ıca, cak, sı, ıc, ca, ak, s, ı, c, a, k] | [sı, s…c, s…a, s…k, ıc, ı…a, ı…k, ca, c…k, ak,... |
9513 rows × 29 columns
In [25]:
## skippy 3grams の生成
#import sys
#sys.path.append("..") # library path に一つ上の階層を追加
import ngrams_skippy
skippy_3grams = [ ngrams_skippy.generate_skippy_trigrams(x,
missing_mark = '…',
max_distance = max_distance_val, check = False)
for x in df['1gram'] ]
## 包括的 skippy 3-grams の生成
if ngram_is_inclusive:
for i, t2 in enumerate(skippy_3grams):
t2.extend(skippy_2grams[i])
#
if verbose:
random.sample(skippy_3grams, 3)
In [26]:
## skippy 3gram 列の追加
df['skippy3gram'] = skippy_3grams
df
Out[26]:
| form | spell | sound | freq | arabic | bengali | chinese | english | esperanto | finnish | ... | spanish | swahili | turkish | size | language | 1gram | 2gram | 3gram | skippy2gram | skippy3gram | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 361 | глубоко | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 7 | russian | [г, л, у, б, о, к, о] | [гл, лу, уб, бо, ок, ко, г, л, у, б, о, к, о] | [глу, луб, убо, бок, око, гл, лу, уб, бо, ок, ... | [гл, г…у, г…б, г…о, г…к, лу, л…б, л…о, л…к, уб... | [глу, гл…б, гл…о, гл…к, г…уб, г…у…о, г…у…к, г…... |
| 333 | 完全な | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 3 | japanese | [完, 全, な] | [完全, 全な, 完, 全, な] | [完全な, 完全, 全な, 完, 全, な] | [完全, 完…な, 全な, 完, 全, な] | [完全な, 完全, 完…な, 全な, 完, 全, な] |
| 164 | そう | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 2 | japanese | [そ, う] | [そう, そ, う] | [そ, う, そう, そ, う] | [そう, そ, う] | [そう, そう, そ, う] |
| 61 | maison | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 6 | french | [m, a, i, s, o, n] | [ma, ai, is, so, on, m, a, i, s, o, n] | [mai, ais, iso, son, ma, ai, is, so, on, m, a,... | [ma, m…i, m…s, m…o, m…n, ai, a…s, a…o, a…n, is... | [mai, ma…s, ma…o, ma…n, m…is, m…i…o, m…i…n, m…... |
| 30 | alle | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 4 | german | [a, l, l, e] | [al, ll, le, a, l, l, e] | [all, lle, al, ll, le, a, l, l, e] | [al, a…l, a…e, ll, l…e, le, a, l, l, e] | [all, al…e, a…le, lle, al, a…l, a…e, ll, l…e, ... |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 227 | arbre | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 5 | french | [a, r, b, r, e] | [ar, rb, br, re, a, r, b, r, e] | [arb, rbr, bre, ar, rb, br, re, a, r, b, r, e] | [ar, a…b, a…r, a…e, rb, r…r, r…e, br, b…e, re,... | [arb, ar…r, ar…e, a…br, a…b…e, a…re, rbr, rb…e... |
| 595 | virág | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 5 | hungarian | [v, i, r, á, g] | [vi, ir, rá, ág, v, i, r, á, g] | [vir, irá, rág, vi, ir, rá, ág, v, i, r, á, g] | [vi, v…r, v…á, v…g, ir, i…á, i…g, rá, r…g, ág,... | [vir, vi…á, vi…g, v…rá, v…r…g, v…ág, irá, ir…g... |
| 829 | horloge | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 7 | french | [h, o, r, l, o, g, e] | [ho, or, rl, lo, og, ge, h, o, r, l, o, g, e] | [hor, orl, rlo, log, oge, ho, or, rl, lo, og, ... | [ho, h…r, h…l, h…o, h…g, h…e, or, o…l, o…o, o…... | [hor, ho…l, ho…o, ho…g, ho…e, h…rl, h…r…o, h…r... |
| 998 | 爱 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 1 | chinese | [爱] | [爱, 爱] | [爱, 爱, 爱] | [爱, 爱] | [爱, 爱, 爱] |
| 386 | sıcak | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 1 | 5 | turkish | [s, ı, c, a, k] | [sı, ıc, ca, ak, s, ı, c, a, k] | [sıc, ıca, cak, sı, ıc, ca, ak, s, ı, c, a, k] | [sı, s…c, s…a, s…k, ıc, ı…a, ı…k, ca, c…k, ak,... | [sıc, sı…a, sı…k, s…ca, s…c…k, s…ak, ıca, ıc…k... |
9513 rows × 30 columns
In [27]:
## LDA 構築の基になる document-term matrix (dtm) を構築
from gensim.corpora.dictionary import Dictionary
bots = df[term_type]
diction = Dictionary(bots)
## 結果の確認
print(diction)
Dictionary<10059 unique tokens: ['б', 'бо', 'б…к', 'б…о', 'г']...>
In [28]:
## diction の濾過
import copy
diction_copy = copy.deepcopy(diction)
## filter適用: 実は諸刃の刃で,token数が少ない時には適用しない方が良い
print(f"min freq filter: {term_min_freq}")
print(f"abuse filter: {term_abuse_threshold}")
apply_filter = True
if apply_filter:
diction_copy.filter_extremes(no_below = term_min_freq, no_above = term_abuse_threshold)
## check
print(diction_copy)
min freq filter: 2 abuse filter: 0.04 Dictionary<6173 unique tokens: ['б', 'бо', 'б…к', 'б…о', 'г']...>
In [29]:
## Corpus (gensim の用語では corpus) の構築
corpus = [ diction.doc2bow(bot) for bot in bots ]
## check
check = True
if verbose:
sample_n = 5
print(random.sample(corpus, sample_n))
#
print(f"Number of documents: {len(corpus)}")
Number of documents: 9513
In [30]:
## LDA モデルの構築
from gensim.models import LdaModel
#from tqdm import tqdm
## LDAモデル
print(f"Building LDA model with n_topics: {n_topics}")
lda = LdaModel(corpus, id2word = diction, num_topics = n_topics, alpha = 0.01)
#
print(lda) # print(..)しないと中身が見れない
Building LDA model with n_topics: 20 LdaModel<num_terms=10059, num_topics=20, decay=0.5, chunksize=2000>
In [31]:
%%capture --no-display
## LDA のtopic ごとに,関連度の高い term を表示
import pandas as pd
n_terms = 20 # topic ごとに表示する term 数の指定
topic_dfs = [ ]
for topic in range(n_topics):
terms = [ ]
for i, prob in lda.get_topic_terms(topic, topn = n_terms):
terms.append(diction.id2token[ int(i) ])
#
topic_dfs.append(pd.DataFrame([terms], index = [ f'topic {topic+1}' ]))
#
topic_term_df = pd.concat(topic_dfs)
## Table で表示
topic_term_df.T
Out[31]:
| topic 1 | topic 2 | topic 3 | topic 4 | topic 5 | topic 6 | topic 7 | topic 8 | topic 9 | topic 10 | topic 11 | topic 12 | topic 13 | topic 14 | topic 15 | topic 16 | topic 17 | topic 18 | topic 19 | topic 20 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ε | ε | ب | t | о | u | h | e | é | α | t | م | ν | o | o | a | α | ا | о | ا |
| 1 | ο | σ | è | e | а | e | e | s | e | μ | z | ر | ي | r | p | l | ο | ل | и | ل |
| 2 | ι | σ…ε | r | r | е | m | z | n | t | η | f | ة | ق | l | o…o | k | τ | ال | с | ي |
| 3 | λ | т | ue | p | л | t | n | i | l | σ | s | ا | ة | h | co | r | γ | و | т | ن |
| 4 | π | τ | v | a | н | i | u | g | é…e | ά | e | ت | α | e | y | e | ν | ع | а | ال |
| 5 | κ | ρ | u…r | n | р | ü | w | e…e | j | ρ | g | d | ώ | t | i | i | ρ | e | ть | ك |
| 6 | ρ | т…и | è…e | a…t | к | c | r | r | el | μα | g…t | د | ο | a | r | a…a | υ | ة | д | ب |
| 7 | α | τε | o | r…e | т | r | á | s…e | él | τ | sz | خ | νο | v | g | a…e | е | ب | в | ج |
| 8 | ί | ερ | ラ | er | с | o | s | en | та | ση | é | ن | α…ν | s | po | ar | т | ر | ь | a |
| 9 | ς | ε…ο | e | t…n | п | l | l | m | é…t | δ | r | ع | ي…ة | y | i…o | d | м | d | о…а | م |
| 10 | a | σ…ί | p…p | a…e | и | m…e | u…r | s…n | le | ί | k | ط | س…ة | o…e | or | c | έ | b | а…ь | ا…ي |
| 11 | τ | н…т | イ | te | в | me | i | e…n | et | μ…α | e…t | ي | س | f | o…e | n | ή | l | о…т | t |
| 12 | ά | στ | é…o | p…e | у | o…e | h…n | n…e | e…e | ω | i | h | ι | or | c | m | м…т | ح | с…т | ر |
| 13 | ό | σ…τ | м…а | t…e | д | c…e | d | i…e | je | ά…α | e…z | م…ة | し | h…t | r…o | b | い | m…g | к…а | ل…ن |
| 14 | χ | ρ…σ | u | p…r | м | o…u | u…e | t | e…t | κ | r…z | o…d | ν…ι | h…e | o…r | ı | т…н | م | φ | ا…ا |
| 15 | μ | т…а | ル | i | о…о | u…e | w…e | e…i | ec | π | o | م…ا | να | fo | e…o | a…l | λ | ا…ة | о…и | ク |
| 16 | ν | ε…ε | ث | c | ь | m…t | h…g | se | té | ό | f…i | ر…ة | ν…ν | る | o…y | r…a | с | ا…ا | ат | ه |
| 17 | ε…ι | ть | к…ы | t…r | е…е | n | g | i…n | r | ή | et | f…d | μ…ν | o…r | gy | ma | υν | ق | и…ь | د |
| 18 | π…ο | т…т | q…e | e…e | г | i…e | h…e | in | l…t | ρα | z…t | ج | 不 | er | yo | la | σ…ν | m | р | ا…ن |
| 19 | ύ | ί…ε | q | re | ер | ü…e | un | s…i | t…e | τ…α | s…t | uc | ق…ة | o…t | og | a…n | ου | ت | р…т | ل…ا |
In [32]:
%%capture --no-display
## pyLDAvis を使った結果 LDA の可視化: 階層クラスタリングより詳しい
import pyLDAvis
#installed_version = sys.version
installed_version = pyLDAvis.__version__
print(f"installed_version: {installed_version}")
if float(installed_version[:3]) > 3.1:
import pyLDAvis.gensim_models as gensimvis
else:
import pyLDAvis.gensim as gensimvis
#
pyLDAvis.enable_notebook()
#
lda_used = lda
corpus_used = corpus
diction_used = diction
## 実行パラメター
use_tSNE = False
if use_tSNE:
vis = gensimvis.prepare(lda_used, corpus_used, diction_used, mds = 'tsne',
n_jobs = 1, sort_topics = False)
else:
vis = gensimvis.prepare(lda_used, corpus_used, diction_used,
n_jobs = 1, sort_topics = False)
#
pyLDAvis.display(vis)
## topic を表わす円の重なりが多いならn_topics が多過ぎる可能性がある.
## ただし2Dで重なっていても,3Dなら重なっていない可能性もある
Out[32]:
In [33]:
## LDA がD に対して生成した topics の弁別性を確認
## 得られたtopics を確認
topic_dist = lda.get_topics()
if verbose:
topic_dist
In [34]:
## 検査 1: topic ごとに分布の和を取る
print(topic_dist.sum(axis = 1))
[1.0000001 1. 1. 1. 0.9999999 1.0000001 1. 0.99999994 1. 1. 1. 0.9999999 0.99999994 1. 0.9999999 1. 0.99999994 1.0000001 0.99999994 0.99999994]
In [35]:
## 検査 2: 総和を求める: n_topics にほぼ等しいなら正常
print(topic_dist.sum())
20.0
In [36]:
## term エンコード値の分布を確認
import matplotlib.pyplot as plt
plt.figure(figsize = (4,5))
sampling_rate = 0.3
df_size = len(topic_dist)
sample_n = round(df_size * sampling_rate)
topic_sampled = random.sample(list(topic_dist), sample_n)
T = sorted([ sorted(x, reverse = True) for x in topic_sampled ])
plt.plot(T, range(len(T)))
plt.title("Distribution of sorted values ({sample_n} samples) for topic/term encoding")
plt.show()
In [37]:
## tSNE を使った topics のグループ化 (3D)
from sklearn.manifold import TSNE
import numpy as np
## tSNE のパラメターを設定
## n_components は射影先の空間の次元: n_components = 3 なら3次元空間に射影
## perplexity は結合の強さを表わす指数で,値に拠って結果が代わるので,色々な値を試すと良い
#perplexity_val = 10 # 大き過ぎると良くない
top_perplexity_reduct_rate = 0.3
perplexity_val = round(len(topic_dist) * top_perplexity_reduct_rate)
topic_tSNE_3d = TSNE(n_components = 3, random_state = 0, perplexity = perplexity_val, n_iter = 1000)
## データに適用
top_tSNE_3d_fitted = topic_tSNE_3d.fit_transform(np.array(topic_dist))
In [38]:
## Plotlyを使って tSNE の結果の可視化 (3D)
#import plotly.express as pex
import plotly.graph_objects as go
import numpy as np
top_tSNE = top_tSNE_3d_fitted
fig = go.Figure(data = [go.Scatter3d(x = top_tSNE[:,0], y = top_tSNE[:,1], z = top_tSNE[:,2],
mode = 'markers')])
## 3D 散布図にラベルを追加する処理は未実装
title_val = f"3D tSNE view for LDA (#topics: {n_topics}, doc: {doc_type}, term: {term_type})"
fig.update_layout(autosize = False,
width = 600, height = 600, title = title_val)
fig.show()
In [39]:
## 構築した LDA モデルを使って文(書)を分類する
## .get_document_topics(..) は minimu_probability = 0としないと
## topic の値が小さい場合に値を返さないので,
## パラメター
ntopics = n_topics # LDA の構築の最に指定した値を使う
check = False
encoding = [ ]
for i, row in df.iterrows():
if check:
print(f"row: {row}")
doc = row[doc_type]
bot = row[term_type]
## get_document_topics(..) では minimu_probability = 0 としないと
## 値が十分に大きな topics に関してだけ値が取れる
enc = lda.get_document_topics(diction.doc2bow(bot), minimum_probability = 0)
if check:
print(f"enc: {enc}")
encoding.append(enc)
#
len(encoding)
Out[39]:
9513
In [40]:
## enc 列の追加
#df['enc'] = np.array(encoding) # This flattens arrays
#df['enc'] = list(encoding) # ineffective
df['enc'] = [ list(map(lambda x: x[1], y)) for y in encoding ]
if verbose:
df['enc']
In [41]:
## エンコーディングのstd の分布を見る
from scipy.stats import tstd
from matplotlib import pyplot as plt
plt.figure(figsize = (6,4))
std_data = [ tstd(x) for x in df['enc'] ]
plt.hist(std_data)
plt.title("Distribution of standard deviations")
plt.show()
In [42]:
## doc のエンコーディング
## 一様分布の事例を除外
from scipy.stats import tstd # standard deviation の計算用
print(f"{len(df)} instances before filtering")
check = False
doc_enc = df['enc']
max_std = max([ tstd(x) for x in doc_enc])
if check: print(f"std max: {max_std}")
min_std = min([ tstd(x) for x in doc_enc])
if check: print(f"std min: {min_std}")
first_min_std = list(sorted(set([ tstd(x) for x in doc_enc])))[-0]
print(f"std 1st min: {first_min_std}")
second_min_std = list(sorted(set([ tstd(x) for x in doc_enc])))[-1]
print(f"std 2nd min: {second_min_std}")
9513 instances before filtering std 1st min: 0.0 std 2nd min: 0.22291233922515472
In [43]:
## df_filtered の定義
## 閾値は2番目に小さい値より小さく最小値よりは大きな値であるべき
std_threshold = second_min_std / 4 # 穏健な値を得るために4で割った
print(f"std_threshold: {std_threshold}")
## Rっぽい次のコードは通らない
#df_filtered = df[ df['encoding'] > std_threshold ]
## 通るのは次のコード: Creating a list of True/False and apply it to DataFrame
std_tested = [ False if tstd(x) < std_threshold else True for x in df['enc'] ]
df_filtered = df[ std_tested ]
#
print(f"{len(df_filtered)} instances after filtering ({len(df) - len(df_filtered)} instances removed)")
std_threshold: 0.05572808480628868 8894 instances after filtering (619 instances removed)
In [44]:
## doc エンコード値の分布を確認
sample_n = 50
E = sorted([ sorted(x, reverse = True) for x in df_filtered['enc'].sample(sample_n) ])
plt.figure(figsize = (5,5))
plt.plot(E, range(len(E)))
plt.title(f"Distribution of sorted encoding values for sampled {sample_n} docs")
plt.show()
In [45]:
len(df_filtered['language'])
Out[45]:
8894
In [46]:
df_filtered['language'].value_counts
Out[46]:
<bound method IndexOpsMixin.value_counts of 361 russian
333 japanese
164 japanese
61 french
30 german
...
671 greek
227 french
595 hungarian
829 french
386 turkish
Name: language, Length: 8894, dtype: object>
In [47]:
## tSNE 用の事例サンプリング = tSNE_df の定義
tSNE_sampling = True
tSNE_sampling_rate = 0.33
if tSNE_sampling:
tSNE_df_original = df_filtered.copy()
sample_n = round(len(tSNE_df_original) * tSNE_sampling_rate)
tSNE_df = tSNE_df_original.sample(sample_n)
print(f"tSNE_df has {len(tSNE_df)} rows after sampling")
else:
tSNE_df = df_filtered
tSNE_df has 2935 rows after sampling
In [48]:
tSNE_df.columns
Out[48]:
Index(['form', 'spell', 'sound', 'freq', 'arabic', 'bengali', 'chinese',
'english', 'esperanto', 'finnish', 'french', 'greek', 'galician',
'german', 'hungarian', 'icelandic', 'irish', 'italian', 'japanese',
'russian', 'spanish', 'swahili', 'turkish', 'size', 'language', '1gram',
'2gram', '3gram', 'skippy2gram', 'skippy3gram', 'enc'],
dtype='object')
In [49]:
tSNE_df['language'].value_counts
Out[49]:
<bound method IndexOpsMixin.value_counts of 344 hungarian
798 french
372 arabic
345 turkish
470 turkish
...
61 german
788 greek
861 arabic
776 chinese
284 russian
Name: language, Length: 2935, dtype: object>
In [50]:
## tSNE の結果の可視化: Plotly を使った 3D 描画
import numpy as np
from sklearn.manifold import TSNE as tSNE
import plotly.express as pex
import plotly.graph_objects as go
import matplotlib.pyplot as plt
## tSNE のパラメターを設定
perplexity_max_val = round(len(tSNE_df)/4)
for perplexity_val in range(5, perplexity_max_val, 60):
## tSNE 事例の生成
tSNE_3d_varied = tSNE(n_components = 3, random_state = 0, perplexity = perplexity_val, n_iter = 1000)
## データに適用
doc_enc = np.array(list(tSNE_df['enc']))
doc_tSNE_3d_varied = tSNE_3d_varied.fit_transform(doc_enc)
T = zip(doc_tSNE_3d_varied[:,0], doc_tSNE_3d_varied[:,1], doc_tSNE_3d_varied[:,2],
tSNE_df['language']) # zip(..)が必要
df = pd.DataFrame(T, columns = ['D1', 'D2', 'D3', 'language'])
## 作図
fig = go.Figure()
for lang in np.unique(df['language']):
part = df[df['language'] == lang]
fig.add_trace(
go.Scatter3d(
x = part['D1'], y = part['D2'], z = part['D3'],
name = lang, mode = 'markers', marker = dict(size = 6),
showlegend = True
)
)
title_val = f"tSNE 3D map (ppl: {perplexity_val}) of '{doc_attr}'s encoded\n by LDA ({n_topics} topics, {term_type})"
fig.update_layout(title = dict(text = title_val),
autosize = False, width = 600, height = 600,)
fig.show()
In [51]:
## 階層クラスタリングのための事例のサンプリング
hc_sampling_rate = 0.1 # 大きくし過ぎると図が見にくい
df_size = len(tSNE_df)
hc_sample_n = round(df_size * hc_sampling_rate)
hc_df = tSNE_df.sample(hc_sample_n)
##
print(f"{hc_sample_n} rows are sampled")
hc_df['language'].value_counts()
294 rows are sampled
Out[51]:
language hungarian 41 russian 39 arabic 35 greek 34 english 32 french 31 japanese 25 german 23 turkish 23 chinese 11 Name: count, dtype: int64
In [58]:
## 日本語表示のための設定
#import matplotlib.pyplot as plt
#plt.rcParams["font.family"] = "Hiragino Sans" # Windows は別のフォント名を指定する必要がある
#plt.rcParams["font.family"] = "Lucida Sans Unicode"
In [59]:
## doc 階層クラスタリングの実行
import numpy as np
import plotly
import matplotlib.pyplot as plt
## 次の設定は arabic が文字化けする
#plt.rcParams["font.family"] = "Hiragino Sans" # Windows は別のフォント名を指定する必要がある
#plt.rcParams["font.family"] = "Lucida Sans Unicode"
from scipy.cluster.hierarchy import dendrogram, linkage
## 距離行列の生成
Enc = list(hc_df['enc'])
linkage = linkage(Enc, method = 'ward', metric = 'euclidean')
## 描画サイズの指定
plt.figure(figsize = (5, round(len(hc_df) * 0.15))) # This needs to be run here, before dendrogram construction.
## 事例ラベルの生成
label_vals = [ x[:max_doc_size] for x in list(hc_df[doc_type]) ] # truncate doc keys
## 樹状分岐図の作成
dendrogram(linkage, orientation = 'left', labels = label_vals, leaf_font_size = 7)
## 描画
plt.title(f"Hierarchical clustering of (sampled) {len(hc_df)} (= {100 * hc_sampling_rate}%) {doc_attr}s as docs\n \
encoded via LDA ({n_topics} topics) with {term_type} as terms")
## ラベルに language に対応する色を付ける
lang_colors = { lang_name : i for i, lang_name in enumerate(np.unique(hc_df['language'])) }
ax = plt.gca()
for ticker in ax.get_ymajorticklabels():
form = ticker.get_text()
row = hc_df.loc[hc_df[doc_type] == form]
#lang = row['language']
lang = row['language'].to_string().split()[-1] # trick
try:
lang_id = lang_colors[lang]
except (TypeError, KeyError):
print(f"color encoding error at: {lang}")
#
ticker.set_color(plotly.colors.qualitative.Plotly[lang_id]) # id の基数調整
#
plt.show()
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 12363 (\N{HIRAGANA LETTER KA}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 12387 (\N{HIRAGANA LETTER SMALL TU}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 12383 (\N{HIRAGANA LETTER TA}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 31169 (\N{CJK UNIFIED IDEOGRAPH-79C1}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 12395 (\N{HIRAGANA LETTER NI}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 30722 (\N{CJK UNIFIED IDEOGRAPH-7802}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 31958 (\N{CJK UNIFIED IDEOGRAPH-7CD6}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 21307 (\N{CJK UNIFIED IDEOGRAPH-533B}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 36889 (\N{CJK UNIFIED IDEOGRAPH-9019}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 19978 (\N{CJK UNIFIED IDEOGRAPH-4E0A}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 12414 (\N{HIRAGANA LETTER MA}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 1570 (\N{ARABIC LETTER ALEF WITH MADDA ABOVE}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Matplotlib currently does not support Arabic natively.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 1582 (\N{ARABIC LETTER KHAH}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 1585 (\N{ARABIC LETTER REH}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 1605 (\N{ARABIC LETTER MEEM}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 1606 (\N{ARABIC LETTER NOON}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 1581 (\N{ARABIC LETTER HAH}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 1591 (\N{ARABIC LETTER TAH}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 1602 (\N{ARABIC LETTER QAF}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 12381 (\N{HIRAGANA LETTER SO}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 12398 (\N{HIRAGANA LETTER NO}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 1593 (\N{ARABIC LETTER AIN}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 1583 (\N{ARABIC LETTER DAL}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 1601 (\N{ARABIC LETTER FEH}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 12399 (\N{HIRAGANA LETTER HA}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 1587 (\N{ARABIC LETTER SEEN}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 1580 (\N{ARABIC LETTER JEEM}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 1577 (\N{ARABIC LETTER TEH MARBUTA}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 1610 (\N{ARABIC LETTER YEH}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 1575 (\N{ARABIC LETTER ALEF}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 1604 (\N{ARABIC LETTER LAM}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 1603 (\N{ARABIC LETTER KAF}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 12458 (\N{KATAKANA LETTER O}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 12540 (\N{KATAKANA-HIRAGANA PROLONGED SOUND MARK}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 12503 (\N{KATAKANA LETTER PU}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 12531 (\N{KATAKANA LETTER N}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 1588 (\N{ARABIC LETTER SHEEN}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 1569 (\N{ARABIC LETTER HAMZA}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 1573 (\N{ARABIC LETTER ALEF WITH HAMZA BELOW}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 1584 (\N{ARABIC LETTER THAL}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 1579 (\N{ARABIC LETTER THEH}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 1578 (\N{ARABIC LETTER TEH}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 12473 (\N{KATAKANA LETTER SU}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 12486 (\N{KATAKANA LETTER TE}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 12451 (\N{KATAKANA LETTER SMALL I}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 12483 (\N{KATAKANA LETTER SMALL TU}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 12463 (\N{KATAKANA LETTER KU}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 1594 (\N{ARABIC LETTER GHAIN}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 1608 (\N{ARABIC LETTER WAW}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 1576 (\N{ARABIC LETTER BEH}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 1590 (\N{ARABIC LETTER DAD}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 1589 (\N{ARABIC LETTER SAD}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 1607 (\N{ARABIC LETTER HEH}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 1571 (\N{ARABIC LETTER ALEF WITH HAMZA ABOVE}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 22855 (\N{CJK UNIFIED IDEOGRAPH-5947}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 24618 (\N{CJK UNIFIED IDEOGRAPH-602A}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 36984 (\N{CJK UNIFIED IDEOGRAPH-9078}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 12406 (\N{HIRAGANA LETTER BU}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 22478 (\N{CJK UNIFIED IDEOGRAPH-57CE}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 35430 (\N{CJK UNIFIED IDEOGRAPH-8A66}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 12377 (\N{HIRAGANA LETTER SU}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 12502 (\N{KATAKANA LETTER BU}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 12521 (\N{KATAKANA LETTER RA}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 12454 (\N{KATAKANA LETTER U}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 21487 (\N{CJK UNIFIED IDEOGRAPH-53EF}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 20197 (\N{CJK UNIFIED IDEOGRAPH-4EE5}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 21040 (\N{CJK UNIFIED IDEOGRAPH-5230}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 20102 (\N{CJK UNIFIED IDEOGRAPH-4E86}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 12506 (\N{KATAKANA LETTER PE}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 12523 (\N{KATAKANA LETTER RU}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 12461 (\N{KATAKANA LETTER KI}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 12450 (\N{KATAKANA LETTER A}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 12488 (\N{KATAKANA LETTER TO}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 36335 (\N{CJK UNIFIED IDEOGRAPH-8DEF}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 12522 (\N{KATAKANA LETTER RI}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 12501 (\N{KATAKANA LETTER HU}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 12471 (\N{KATAKANA LETTER SI}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 12508 (\N{KATAKANA LETTER BO}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 35874 (\N{CJK UNIFIED IDEOGRAPH-8C22}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 25104 (\N{CJK UNIFIED IDEOGRAPH-6210}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 21151 (\N{CJK UNIFIED IDEOGRAPH-529F}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 20415 (\N{CJK UNIFIED IDEOGRAPH-4FBF}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 33391 (\N{CJK UNIFIED IDEOGRAPH-826F}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 12356 (\N{HIRAGANA LETTER I}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 25126 (\N{CJK UNIFIED IDEOGRAPH-6226}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 29305 (\N{CJK UNIFIED IDEOGRAPH-7279}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 23450 (\N{CJK UNIFIED IDEOGRAPH-5B9A}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 12367 (\N{HIRAGANA LETTER KU}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 12388 (\N{HIRAGANA LETTER TU}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 38291 (\N{CJK UNIFIED IDEOGRAPH-9593}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 12391 (\N{HIRAGANA LETTER DE}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 30340 (\N{CJK UNIFIED IDEOGRAPH-7684}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 26178 (\N{CJK UNIFIED IDEOGRAPH-6642}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 20505 (\N{CJK UNIFIED IDEOGRAPH-5019}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 12472 (\N{KATAKANA LETTER ZI}) missing from current font.
In [60]:
## tSNE の結果の可視化 (2D)
#import seaborn as sns
import numpy as np
import plotly
import plotly.express as pex
import matplotlib.pyplot as plt
from adjustText import adjust_text
## tSNE 事例の生成
perplexity_selected = 250
tSNE_3d = tSNE(n_components = 3, random_state = 0, perplexity = perplexity_selected, n_iter = 1000)
## データに適用
doc_enc = np.array(list(tSNE_df['enc']))
doc_tSNE_3d = tSNE_3d.fit_transform(doc_enc)
T = zip(doc_tSNE_3d[:,0], doc_tSNE_3d[:,1], doc_tSNE_3d[:,2],
tSNE_df['language']) # zip(..)が必要
df = pd.DataFrame(T, columns = ['D1', 'D2', 'D3', 'language'])
## 描画
## 次の設定は arabic が文字化けする
#plt.rcParams["font.family"] = "Hiragino Sans" # Windows は別のフォント名を指定する必要がある
#plt.rcParams["font.family"] = "Lucida Sans Unicode"
plt.figure(figsize = (5, 5))
plt.set_colors = pex.colors.qualitative.Plotly
for r in [ np.roll([0,1,2], -i) for i in range(0,3) ]:
if check:
print(r)
X, Y = df.iloc[:,r[0]], df.iloc[:,r[1]]
gmax = max(X.max(), Y.max())
gmin = min(X.min(), Y.min())
plt.xlim(gmin, gmax)
plt.ylim(gmin, gmax)
colormap = pex.colors.qualitative.Plotly
lang_list = list(np.unique(tSNE_df['language']))
cmapped = [ colormap[lang_list.index(lang)] for lang in df['language'] ]
scatter = plt.scatter(X, Y, s = 40, c = cmapped, edgecolors = 'w')
## 文字を表示する事例のサンプリング
lab_sampling_rate = 0.02
lab_sample_n = round(len(tSNE_df) * lab_sampling_rate)
sampled_keys = [ doc[:max_doc_size] for doc in random.sample(list(tSNE_df[doc_type]), lab_sample_n) ]
## labels の生成
texts = [ ]
for x, y, s in zip(X, Y, sampled_keys):
texts.append(plt.text(x, y, s, size = 9, color = 'blue'))
## label に repel を追加: adjustText package の導入が必要
adjust_text(texts, force_points = 0.2, force_text = 0.2,
expand_points = (1, 1), expand_text = (1, 1),
arrowprops = dict(arrowstyle = "-", color = 'black', lw = 0.5))
#
plt.title(f"tSNE (ppl: {perplexity_selected}) 2D map of {len(tSNE_df)} {doc_attr}s via LDA ({term_type}; {n_topics} topics)")
#plt.legend(np.unique(cmapped))
plt.show()
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 12489 (\N{KATAKANA LETTER DO}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 12521 (\N{KATAKANA LETTER RA}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 12452 (\N{KATAKANA LETTER I}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 12502 (\N{KATAKANA LETTER BU}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 27508 (\N{CJK UNIFIED IDEOGRAPH-6B74}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 21490 (\N{CJK UNIFIED IDEOGRAPH-53F2}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 22909 (\N{CJK UNIFIED IDEOGRAPH-597D}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 21543 (\N{CJK UNIFIED IDEOGRAPH-5427}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 1575 (\N{ARABIC LETTER ALEF}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Matplotlib currently does not support Arabic natively.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 1604 (\N{ARABIC LETTER LAM}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 1576 (\N{ARABIC LETTER BEH}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 1593 (\N{ARABIC LETTER AIN}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 1590 (\N{ARABIC LETTER DAD}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 12473 (\N{KATAKANA LETTER SU}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 12486 (\N{KATAKANA LETTER TE}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 12451 (\N{KATAKANA LETTER SMALL I}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 12483 (\N{KATAKANA LETTER SMALL TU}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 12463 (\N{KATAKANA LETTER KU}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 12456 (\N{KATAKANA LETTER E}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 12531 (\N{KATAKANA LETTER N}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 12472 (\N{KATAKANA LETTER ZI}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 24863 (\N{CJK UNIFIED IDEOGRAPH-611F}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 35226 (\N{CJK UNIFIED IDEOGRAPH-899A}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 1587 (\N{ARABIC LETTER SEEN}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 1577 (\N{ARABIC LETTER TEH MARBUTA}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 12467 (\N{KATAKANA LETTER KO}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 12512 (\N{KATAKANA LETTER MU}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 12461 (\N{KATAKANA LETTER KI}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 26041 (\N{CJK UNIFIED IDEOGRAPH-65B9}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 1602 (\N{ARABIC LETTER QAF}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 1591 (\N{ARABIC LETTER TAH}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 1606 (\N{ARABIC LETTER NOON}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 26360 (\N{CJK UNIFIED IDEOGRAPH-66F8}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 12363 (\N{HIRAGANA LETTER KA}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 12428 (\N{HIRAGANA LETTER RE}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 12383 (\N{HIRAGANA LETTER TA}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 12525 (\N{KATAKANA LETTER RO}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 12464 (\N{KATAKANA LETTER GU}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 12489 (\N{KATAKANA LETTER DO}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 12521 (\N{KATAKANA LETTER RA}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 12452 (\N{KATAKANA LETTER I}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 12502 (\N{KATAKANA LETTER BU}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 27508 (\N{CJK UNIFIED IDEOGRAPH-6B74}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 21490 (\N{CJK UNIFIED IDEOGRAPH-53F2}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 22909 (\N{CJK UNIFIED IDEOGRAPH-597D}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 21543 (\N{CJK UNIFIED IDEOGRAPH-5427}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 1575 (\N{ARABIC LETTER ALEF}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Matplotlib currently does not support Arabic natively.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 1604 (\N{ARABIC LETTER LAM}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 1576 (\N{ARABIC LETTER BEH}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 1593 (\N{ARABIC LETTER AIN}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 1590 (\N{ARABIC LETTER DAD}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 12473 (\N{KATAKANA LETTER SU}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 12486 (\N{KATAKANA LETTER TE}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 12451 (\N{KATAKANA LETTER SMALL I}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 12483 (\N{KATAKANA LETTER SMALL TU}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 12463 (\N{KATAKANA LETTER KU}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 12456 (\N{KATAKANA LETTER E}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 12531 (\N{KATAKANA LETTER N}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 12472 (\N{KATAKANA LETTER ZI}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 24863 (\N{CJK UNIFIED IDEOGRAPH-611F}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 35226 (\N{CJK UNIFIED IDEOGRAPH-899A}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 1587 (\N{ARABIC LETTER SEEN}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 1577 (\N{ARABIC LETTER TEH MARBUTA}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 12467 (\N{KATAKANA LETTER KO}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 12512 (\N{KATAKANA LETTER MU}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 12461 (\N{KATAKANA LETTER KI}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 26041 (\N{CJK UNIFIED IDEOGRAPH-65B9}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 1602 (\N{ARABIC LETTER QAF}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 1591 (\N{ARABIC LETTER TAH}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 1606 (\N{ARABIC LETTER NOON}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 26360 (\N{CJK UNIFIED IDEOGRAPH-66F8}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 12363 (\N{HIRAGANA LETTER KA}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 12428 (\N{HIRAGANA LETTER RE}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 12383 (\N{HIRAGANA LETTER TA}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 12525 (\N{KATAKANA LETTER RO}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 12464 (\N{KATAKANA LETTER GU}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 12459 (\N{KATAKANA LETTER KA}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 12454 (\N{KATAKANA LETTER U}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 12488 (\N{KATAKANA LETTER TO}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 24040 (\N{CJK UNIFIED IDEOGRAPH-5DE8}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 22823 (\N{CJK UNIFIED IDEOGRAPH-5927}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 12394 (\N{HIRAGANA LETTER NA}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 12524 (\N{KATAKANA LETTER RE}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 38431 (\N{CJK UNIFIED IDEOGRAPH-961F}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 1585 (\N{ARABIC LETTER REH}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 1610 (\N{ARABIC LETTER YEH}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 12506 (\N{KATAKANA LETTER PE}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 12523 (\N{KATAKANA LETTER RU}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 20804 (\N{CJK UNIFIED IDEOGRAPH-5144}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 24351 (\N{CJK UNIFIED IDEOGRAPH-5F1F}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 25152 (\N{CJK UNIFIED IDEOGRAPH-6240}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 26377 (\N{CJK UNIFIED IDEOGRAPH-6709}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 29305 (\N{CJK UNIFIED IDEOGRAPH-7279}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 23450 (\N{CJK UNIFIED IDEOGRAPH-5B9A}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 12398 (\N{HIRAGANA LETTER NO}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 1603 (\N{ARABIC LETTER KAF}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 1592 (\N{ARABIC LETTER ZAH}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 1605 (\N{ARABIC LETTER MEEM}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 12391 (\N{HIRAGANA LETTER DE}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 12365 (\N{HIRAGANA LETTER KI}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 12427 (\N{HIRAGANA LETTER RU}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 27491 (\N{CJK UNIFIED IDEOGRAPH-6B63}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 22312 (\N{CJK UNIFIED IDEOGRAPH-5728}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 22478 (\N{CJK UNIFIED IDEOGRAPH-57CE}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 1583 (\N{ARABIC LETTER DAL}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 1608 (\N{ARABIC LETTER WAW}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 12522 (\N{KATAKANA LETTER RI}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 12540 (\N{KATAKANA-HIRAGANA PROLONGED SOUND MARK}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 12481 (\N{KATAKANA LETTER TI}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 27875 (\N{CJK UNIFIED IDEOGRAPH-6CE3}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 12367 (\N{HIRAGANA LETTER KU}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 1581 (\N{ARABIC LETTER HAH}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 12505 (\N{KATAKANA LETTER BE}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 23601 (\N{CJK UNIFIED IDEOGRAPH-5C31}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 26159 (\N{CJK UNIFIED IDEOGRAPH-662F}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 20123 (\N{CJK UNIFIED IDEOGRAPH-4E9B}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 12459 (\N{KATAKANA LETTER KA}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 12454 (\N{KATAKANA LETTER U}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 12488 (\N{KATAKANA LETTER TO}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 24040 (\N{CJK UNIFIED IDEOGRAPH-5DE8}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 22823 (\N{CJK UNIFIED IDEOGRAPH-5927}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 12394 (\N{HIRAGANA LETTER NA}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 12524 (\N{KATAKANA LETTER RE}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 38431 (\N{CJK UNIFIED IDEOGRAPH-961F}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 1585 (\N{ARABIC LETTER REH}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 1610 (\N{ARABIC LETTER YEH}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 12506 (\N{KATAKANA LETTER PE}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 12523 (\N{KATAKANA LETTER RU}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 20804 (\N{CJK UNIFIED IDEOGRAPH-5144}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 24351 (\N{CJK UNIFIED IDEOGRAPH-5F1F}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 25152 (\N{CJK UNIFIED IDEOGRAPH-6240}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 26377 (\N{CJK UNIFIED IDEOGRAPH-6709}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 29305 (\N{CJK UNIFIED IDEOGRAPH-7279}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 23450 (\N{CJK UNIFIED IDEOGRAPH-5B9A}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 12398 (\N{HIRAGANA LETTER NO}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 1603 (\N{ARABIC LETTER KAF}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 1592 (\N{ARABIC LETTER ZAH}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 1605 (\N{ARABIC LETTER MEEM}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 12391 (\N{HIRAGANA LETTER DE}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 12365 (\N{HIRAGANA LETTER KI}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 12427 (\N{HIRAGANA LETTER RU}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 27491 (\N{CJK UNIFIED IDEOGRAPH-6B63}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 22312 (\N{CJK UNIFIED IDEOGRAPH-5728}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 22478 (\N{CJK UNIFIED IDEOGRAPH-57CE}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 1583 (\N{ARABIC LETTER DAL}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 1608 (\N{ARABIC LETTER WAW}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 12522 (\N{KATAKANA LETTER RI}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 12540 (\N{KATAKANA-HIRAGANA PROLONGED SOUND MARK}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 12481 (\N{KATAKANA LETTER TI}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 27875 (\N{CJK UNIFIED IDEOGRAPH-6CE3}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 12367 (\N{HIRAGANA LETTER KU}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 1581 (\N{ARABIC LETTER HAH}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 12505 (\N{KATAKANA LETTER BE}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 23601 (\N{CJK UNIFIED IDEOGRAPH-5C31}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 26159 (\N{CJK UNIFIED IDEOGRAPH-662F}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 20123 (\N{CJK UNIFIED IDEOGRAPH-4E9B}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 25915 (\N{CJK UNIFIED IDEOGRAPH-653B}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 1589 (\N{ARABIC LETTER SAD}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 19981 (\N{CJK UNIFIED IDEOGRAPH-4E0D}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 24605 (\N{CJK UNIFIED IDEOGRAPH-601D}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 35696 (\N{CJK UNIFIED IDEOGRAPH-8B70}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 27604 (\N{CJK UNIFIED IDEOGRAPH-6BD4}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 1601 (\N{ARABIC LETTER FEH}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 12471 (\N{KATAKANA LETTER SI}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 12515 (\N{KATAKANA LETTER SMALL YA}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 1578 (\N{ARABIC LETTER TEH}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 19990 (\N{CJK UNIFIED IDEOGRAPH-4E16}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 30028 (\N{CJK UNIFIED IDEOGRAPH-754C}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 1580 (\N{ARABIC LETTER JEEM}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 12496 (\N{KATAKANA LETTER BA}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 1574 (\N{ARABIC LETTER YEH WITH HAMZA ABOVE}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 32473 (\N{CJK UNIFIED IDEOGRAPH-7ED9}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 21015 (\N{CJK UNIFIED IDEOGRAPH-5217}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/adjustText/__init__.py:564: UserWarning:
Glyph 36554 (\N{CJK UNIFIED IDEOGRAPH-8ECA}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 25915 (\N{CJK UNIFIED IDEOGRAPH-653B}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 1589 (\N{ARABIC LETTER SAD}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 19981 (\N{CJK UNIFIED IDEOGRAPH-4E0D}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 24605 (\N{CJK UNIFIED IDEOGRAPH-601D}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 35696 (\N{CJK UNIFIED IDEOGRAPH-8B70}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 27604 (\N{CJK UNIFIED IDEOGRAPH-6BD4}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 1601 (\N{ARABIC LETTER FEH}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 12471 (\N{KATAKANA LETTER SI}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 12515 (\N{KATAKANA LETTER SMALL YA}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 1578 (\N{ARABIC LETTER TEH}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 19990 (\N{CJK UNIFIED IDEOGRAPH-4E16}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 30028 (\N{CJK UNIFIED IDEOGRAPH-754C}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 1580 (\N{ARABIC LETTER JEEM}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 12496 (\N{KATAKANA LETTER BA}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 1574 (\N{ARABIC LETTER YEH WITH HAMZA ABOVE}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 32473 (\N{CJK UNIFIED IDEOGRAPH-7ED9}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 21015 (\N{CJK UNIFIED IDEOGRAPH-5217}) missing from current font.
/Volumes/K/opt/miniconda3/lib/python3.10/site-packages/IPython/core/pylabtools.py:152: UserWarning:
Glyph 36554 (\N{CJK UNIFIED IDEOGRAPH-8ECA}) missing from current font.
In [ ]: